Multimodal Representation and Retrieval
Visual Adaptive Prompting for Compositional Zero-Shot Learning-
[pdf]
[arXiv]
[bibtex]@InProceedings{Stein_2025_ICCV,
  author    = {Stein, Kyle and Mahyari, Andrew Arash and Francia, Guillermo and El-Sheikh, Eman},
  title     = {Visual Adaptive Prompting for Compositional Zero-Shot Learning},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4137--4146},
}
Smart Routing for Multimodal Video Retrieval: When to Search What-
[pdf]
[arXiv]
[bibtex]@InProceedings{Rosa_2025_ICCV,
  author    = {Dela Rosa, Kevin},
  title     = {Smart Routing for Multimodal Video Retrieval: When to Search What},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4118--4126},
}
Refining Skewed Perceptions in Vision-Language Contrastive Models through Visual Representations-
[pdf]
[arXiv]
[bibtex]@InProceedings{Dai_2025_ICCV,
  author    = {Dai, Haocheng and Joshi, Sarang},
  title     = {Refining Skewed Perceptions in Vision-Language Contrastive Models through Visual Representations},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4071--4080},
}
Chrono: A Simple Blueprint for Representing Time in MLLMs-
[pdf]
[arXiv]
[bibtex]@InProceedings{Meinardus_2025_ICCV,
  author    = {Meinardus, Boris and Rodriguez, Hector G. and Batra, Anil and Rohrbach, Anna and Rohrbach, Marcus},
  title     = {Chrono: A Simple Blueprint for Representing Time in {MLLMs}},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4092--4097},
}
Rate-Distortion Limits for Multimodal Retrieval: Theory, Optimal Codes, and Finite-Sample Guarantees-
[pdf]
[arXiv]
[bibtex]@InProceedings{Chen_2025_ICCV,
  author    = {Chen, Thomas Y.},
  title     = {Rate-Distortion Limits for Multimodal Retrieval: Theory, Optimal Codes, and Finite-Sample Guarantees},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4147--4156},
}
IRR-LMM: Improving On-demand Retail Recommendation with Large Multi-Modal Models-
[pdf]
[bibtex]@InProceedings{Zhao_2025_ICCV,
  author    = {Zhao, Yihao and Lai, Nan and Li, Xiaoming and Yan, Xu and Deng, Wenhao and Huang, Hujiang and Zhang, Shuai and Lin, Wei},
  title     = {{IRR-LMM}: Improving On-demand Retail Recommendation with Large Multi-Modal Models},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4127--4136},
}
MIND-RAG: Multimodal Context-Aware and Intent-Aware Retrieval-Augmented Generation for Educational Publications-
[pdf]
[bibtex]@InProceedings{Yu_2025_ICCV,
  author    = {Yu, Jiayang and Xie, Yuxi and Zhang, Guixuan and Liu, Jie and Zeng, Zhi and Huang, Ying and Zhang, Shuwu},
  title     = {{MIND-RAG}: Multimodal Context-Aware and Intent-Aware Retrieval-Augmented Generation for Educational Publications},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4157--4164},
}
Towards Reporting Bias in Visual-Language Datasets: Bi-modal Data Augmentation by Decoupling Object-Attribute Association-
[pdf]
[bibtex]@InProceedings{Wu_2025_ICCV,
  author    = {Wu, Qiyu and Zhao, Mengjie and He, Yutong and Huang, Lang and Ono, Junya and Wakaki, Hiromi and Mitsufuji, Yuki},
  title     = {Towards Reporting Bias in Visual-Language Datasets: Bi-modal Data Augmentation by Decoupling Object-Attribute Association},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4098--4107},
}
Document Haystack: A Long Context Multimodal Image/Document Understanding Vision LLM Benchmark-
[pdf]
[arXiv]
[bibtex]@InProceedings{Huybrechts_2025_ICCV,
  author    = {Huybrechts, Goeric and Ronanki, Srikanth and Jayanthi, Sai Muralidhar and Fitzgerald, Jack and Veeravanallur, Srinivasan},
  title     = {Document Haystack: A Long Context Multimodal Image/Document Understanding Vision {LLM} Benchmark},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4062--4070},
}
Global-to-Local or Local-to-Global? Enhancing Image Retrieval with Efficient Local Search and Effective Global Re-ranking-
[pdf]
[arXiv]
[bibtex]@InProceedings{Aiger_2025_ICCV,
  author    = {Aiger, Dror and Cao, Bingyi and Chen, Kaifeng and Araujo, Andre},
  title     = {Global-to-Local or Local-to-Global? Enhancing Image Retrieval with Efficient Local Search and Effective Global Re-ranking},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4108--4117},
}
Med-GRIM: Enhanced Zero-Shot Medical VQA using prompt-embedded Multimodal Graph RAG-
[pdf]
[bibtex]@InProceedings{Madavan_2025_ICCV,
  author    = {Madavan, Rakesh Raj and Kaimal, Akshat and Faisal, Hashim and S, Chandrakala},
  title     = {{Med-GRIM}: Enhanced Zero-Shot Medical {VQA} using prompt-embedded Multimodal Graph {RAG}},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4081--4091},
}